
#### Configuration #####
library(dplyr)
library(randomForestSRC)
library(xgboost)
library(MLmetrics)
library(caret)

# Print numbers in fixed notation instead of scientific notation.
options(scipen = 999)

# All relative file names used below are resolved against this directory.
setwd("E:/Fortis/Workspace/Views Competition/Update September 2020")
# setwd("C:/Users/Felix Ettensperger/Desktop/Views Competition/Backup")

# Request a 32000 MB memory ceiling. memory.size() only *reports* memory use
# (its argument is the logical `max`), so the limit has to be requested with
# memory.limit(). Both functions are Windows-only and are unsupported stubs
# from R 4.2.0 on, hence the guard.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  invisible(memory.limit(size = 32000))
}


# Training table for the s3 stacking models.
Ensemble.Stacking.Task2.s3.fin <- read.csv(
  file = "Ensemble_Stacking_Task2_training_data_S3.csv",
  header = TRUE
)


### fix wrong ln_ged_change
### (rebuild the outcome column `real` from views_ged.csv)

views.ged <- read.csv(file = "views_ged.csv", header = TRUE)
views.ged.pred.t2 <- filter(views.ged, month_id >= 114, month_id <= 441)

# Keep columns 2, 3 and 58, move month_id forward by 3, and carry
# ln_ged_best_sb_s3 over into a column called `real`.
merge.true.t2.s3 <- views.ged.pred.t2[, c(2:3, 58)]
merge.true.t2.s3$month_id <- merge.true.t2.s3$month_id + 3
merge.true.t2.s3$real <- merge.true.t2.s3$ln_ged_best_sb_s3
merge.true.t2.s3$ln_ged_best_sb_s3 <- NULL

# Attach `real` by (month_id, country_id) and drop the old outcome column.
Ensemble.Stacking.Task2.s3.fin.fixed <- left_join(
  Ensemble.Stacking.Task2.s3.fin,
  merge.true.t2.s3,
  by = c("month_id", "country_id")
)
Ensemble.Stacking.Task2.s3.fin.fixed$ln_ged_best_sb_s3 <- NULL

## rm(Ensemble.Stacking.Task2.s3.fin.fixed)

# Rename RF.t2.united.s3 to RF.United.t2.s3, then store the corrected table.
Ensemble.Stacking.Task2.s3.fin.fixed <- rename(
  Ensemble.Stacking.Task2.s3.fin.fixed,
  "RF.United.t2.s3" = "RF.t2.united.s3"
)
write.csv(
  Ensemble.Stacking.Task2.s3.fin.fixed,
  file = "Ensemble_Stacking_T2_S3_training_data_07_07_2021.csv",
  row.names = FALSE
)

#### NOPE

## Series 2: XGB variants without scaling (because of high MSE scores?)
# Ensemble.Stacking.Task2.s3.series2 <- read.csv("Ensemble_Stacking_Task2_training_data_S3_series2.csv", header=TRUE)

## series 3 train stacking algorithm only on African cases
# Ensemble.Stacking.Task2.s3.series3 <- read.csv("Ensemble_Stacking_Task2_training_data_S3_series3.csv", header=TRUE)


## predictions reaching into test dataset t2 are deleted
# Ensemble.Stacking.Task2.s3.fin[1:60491,] -> Ensemble.Stacking.Task2.s3.fin
# Ensemble.Stacking.Task2.s3.series2[1:60491,] -> Ensemble.Stacking.Task2.s3.fin
# Ensemble.Stacking.Task2.s3.series3[1:16786,] -> Ensemble.Stacking.Task2.s3.fin


# Continue with the corrected table under the original object name.
Ensemble.Stacking.Task2.s3.fin <- Ensemble.Stacking.Task2.s3.fin.fixed


### unweighted ensemble: row-wise mean of columns 4 to 9

Ensemble.Stacking.Task2.s3.fin$unw.ensemble <- rowMeans(
  Ensemble.Stacking.Task2.s3.fin[, 4:9]
)

# MSE of the unweighted ensemble against `real`; trailing numbers are the
# values noted in the original script.
MSE(y_pred = Ensemble.Stacking.Task2.s3.fin$unw.ensemble,
    y_true = Ensemble.Stacking.Task2.s3.fin$real) ## 0.1

# MSE of each individual model column against `real`.
MSE(y_pred = Ensemble.Stacking.Task2.s3.fin$RF.GED.t2.s3,
    y_true = Ensemble.Stacking.Task2.s3.fin$real) ## 0.16
MSE(y_pred = Ensemble.Stacking.Task2.s3.fin$XGB.GED.t2.s3,
    y_true = Ensemble.Stacking.Task2.s3.fin$real) ## 0.05
MSE(y_pred = Ensemble.Stacking.Task2.s3.fin$RF.United.t2.s3,
    y_true = Ensemble.Stacking.Task2.s3.fin$real) ## 0.11
MSE(y_pred = Ensemble.Stacking.Task2.s3.fin$XGB.United.t2.s3,
    y_true = Ensemble.Stacking.Task2.s3.fin$real) ## 0.012
MSE(y_pred = Ensemble.Stacking.Task2.s3.fin$RF.GED.t2.s3.africa,
    y_true = Ensemble.Stacking.Task2.s3.fin$real) ## 0.31
MSE(y_pred = Ensemble.Stacking.Task2.s3.fin$XGB.GED.t2.s3.africa,
    y_true = Ensemble.Stacking.Task2.s3.fin$real) ## 0.26

## now it looks good


#############++++++++++++++++++++++++#################################################################
######################################################################################################
## Correct ####


## test data: prediction tables for task 2 and task 3 (s3, africa files)
Pred.Ensemble.Task2.s3.africa <- read.csv(
  file = "Pred_Ensemble_Task2_s3_africa.csv",
  header = TRUE
)
Pred.Ensemble.Task3.s3.africa <- read.csv(
  file = "Pred_Ensemble_Task3_s3_africa.csv",
  header = TRUE
)

# Reorder the task-2 columns, then append the row mean of columns 4 to 9.
Pred.Ensemble.Task2.s3.africa.fin <-
  Pred.Ensemble.Task2.s3.africa[, c(5:7, 1:4, 9:11)]
Pred.Ensemble.Task2.s3.africa.fin$unw.ensemble <- rowMeans(
  Pred.Ensemble.Task2.s3.africa.fin[, 4:9]
)

# Same for task 3 (different source column order, no outcome column yet).
Pred.Ensemble.Task3.s3.africa.fin <-
  Pred.Ensemble.Task3.s3.africa[, c(3:5, 1:2, 6:7, 9:10)]
Pred.Ensemble.Task3.s3.africa.fin$unw.ensemble <- rowMeans(
  Pred.Ensemble.Task3.s3.africa.fin[, 4:9]
)

## add real values (joined by month_id and country_id)

Pred.Ensemble.Task3.s3.africa.true <- left_join(
  Pred.Ensemble.Task3.s3.africa.fin,
  merge.true.t2.s3,
  by = c("month_id", "country_id")
)

MSE(y_pred = Pred.Ensemble.Task3.s3.africa.true$unw.ensemble,
    y_true = Pred.Ensemble.Task3.s3.africa.true$real)
## correct


## sync colnames
# T3.s3 swaps its last two columns (11 before 10) so that it lines up
# position by position with T2.s3 before taking over T2.s3's column names.

T2.s3 <- Pred.Ensemble.Task2.s3.africa.fin[, 1:11]
T3.s3 <- Pred.Ensemble.Task3.s3.africa.true[, c(1:9, 11, 10)]


# Ensemble.Stacking.Task2.s3.fin <- rename(Ensemble.Stacking.Task2.s3.fin, "RF.United.t2.s3"  = "RF.t2.united.s3")
colnames(T3.s3) <- colnames(T2.s3)



## test unw ensemble with test data (looks good)
MSE(y_pred = T2.s3$unw.ensemble, y_true = T2.s3$real)
MSE(y_pred = T3.s3$unw.ensemble, y_true = T3.s3$real)

## stacking forest

# Random forest (550 trees) that regresses `real` on the remaining columns of
# positions 4 to 10; rows containing NA are omitted.
# NOTE(review): the forest is fit on the task-2 prediction table itself and is
# later used to predict that same table, so the resulting MSE is in-sample.
# The boosting and k-NN stackers are fit on Ensemble.Stacking.Task2.s3.fin
# instead -- TODO confirm which training table was intended here.
stacking.forest <- rfsrc(
  real ~ .,
  data = Pred.Ensemble.Task2.s3.africa.fin[, 4:10],
  ntree = 550,
  na.action = "na.omit"
)

# Print the fitted forest summary.
stacking.forest


### stacking: fit the forest on the T3 table, then apply it to the T2 table

stacking.T3 <- rfsrc(
  real ~ .,
  T3.s3[, 4:10],
  ntree = 550,
  na.action = "na.omit"
)

# Print the fitted forest summary.
stacking.T3

# Predict from columns 4 to 9 of the T2 table.
stacking.t3.prediction <- predict(
  stacking.T3,
  T2.s3[, 4:9],
  na.action = "na.omit"
)

T2.s3$T3.based.stacking.prediction <- stacking.t3.prediction[["predicted"]]


### test: unweighted ensemble vs. T3-based stacking ###
MSE(y_pred = T2.s3$unw.ensemble, y_true = T2.s3$real)
MSE(y_pred = T2.s3$T3.based.stacking.prediction, y_true = T2.s3$real)






#####
#####
#####
#####
#####
#####
#####
#####
####
###
##
##
#
#
#


## stacking boost

# Feature matrices are columns 4 to 9; column 10 is used as the label.
# NOTE(review): column 10 is presumably `real` -- confirm against the tables.
train.data.xgb <- as.matrix(Ensemble.Stacking.Task2.s3.fin[, 4:9])
train.label.xgb <- as.matrix(Ensemble.Stacking.Task2.s3.fin[, 10])
test.data.xgb <- as.matrix(Pred.Ensemble.Task2.s3.africa.fin[, 4:9])
test.label.xgb <- as.matrix(Pred.Ensemble.Task2.s3.africa.fin[, 10])

# Gradient-boosted stacker. "reg:squarederror" is the current name of the
# squared-error objective ("reg:linear" is its deprecated alias), and the
# arguments are spelled out in full instead of relying on partial matching.
stacking.boost <- xgboost(
  data = train.data.xgb,
  label = train.label.xgb,
  objective = "reg:squarederror",
  eval_metric = "rmse",
  max_depth = 25,
  eta = 0.01,
  nrounds = 950,
  subsample = 0.5,
  colsample_bytree = 0.5
)

###
# k-NN stacker (despite the object name, this is not a linear model)
stacking.linear.model <- caret::train(
  x = Ensemble.Stacking.Task2.s3.fin[, 4:9],
  y = Ensemble.Stacking.Task2.s3.fin[, 10],
  method = "knn"
)


## predict test set with stacking forest

stacking.forest.prediction <- predict(
  stacking.forest,
  Pred.Ensemble.Task2.s3.africa.fin[, 4:9],
  na.action = "na.omit"
)

Pred.Ensemble.Task2.s3.africa.fin$Stacking.prediction <-
  stacking.forest.prediction[["predicted"]]


## predict test set with stacking boost

stacking.boost.prediction <- predict(
  stacking.boost,
  test.data.xgb,
  reshape = TRUE
)
stacking.boost.prediction.frame <- as.data.frame(stacking.boost.prediction)

Pred.Ensemble.Task2.s3.africa.fin$Stacking.boost.pred <-
  stacking.boost.prediction.frame$stacking.boost.prediction

### predict test with knn stacking

predictions <- predict(
  object = stacking.linear.model,
  Pred.Ensemble.Task2.s3.africa.fin[, 4:9]
)
knn.prediction.frame <- as.data.frame(predictions)

Pred.Ensemble.Task2.s3.africa.fin$knn.stacking.prediction <-
  knn.prediction.frame$predictions







### Eval: each ensemble / stacking column of the task-2 table against `real`

MSE(y_pred = Pred.Ensemble.Task2.s3.africa.fin$unw.ensemble,
    y_true = Pred.Ensemble.Task2.s3.africa.fin$real)
MSE(y_pred = Pred.Ensemble.Task2.s3.africa.fin$Stacking.prediction,
    y_true = Pred.Ensemble.Task2.s3.africa.fin$real)
MSE(y_pred = Pred.Ensemble.Task2.s3.africa.fin$Stacking.boost.pred,
    y_true = Pred.Ensemble.Task2.s3.africa.fin$real)
MSE(y_pred = Pred.Ensemble.Task2.s3.africa.fin$knn.stacking.prediction,
    y_true = Pred.Ensemble.Task2.s3.africa.fin$real)

# Mean squared difference between the unweighted ensemble and the forest
# stacking prediction (not against `real`).
MSE(y_pred = Pred.Ensemble.Task2.s3.africa.fin$unw.ensemble,
    y_true = Pred.Ensemble.Task2.s3.africa.fin$Stacking.prediction)



#+#+#++###+++++######+++++###++#+#+#


####### New idea: load the t3 predictions, train the stacking model on them,
####### then predict the t2 table with that model

Pred.Ensemble.Task3.s3.africa <- read.csv(
  file = "Pred_Ensemble_Task3_s3_africa.csv",
  header = TRUE
)
Pred.Ensemble.Task3.s3.africa.fin <-
  Pred.Ensemble.Task3.s3.africa[, c(3:5, 1:2, 6:7, 9:10)]
Pred.Ensemble.Task3.s3.africa.fin <- left_join(
  Pred.Ensemble.Task3.s3.africa.fin,
  merge.true.t2.s3,
  by = c("month_id", "country_id")
)

# The join comes first here, so unw.ensemble ends up as the last column.
Pred.Ensemble.Task3.s3.africa.fin$unw.ensemble <- rowMeans(
  Pred.Ensemble.Task3.s3.africa.fin[, 4:9]
)

## changed order: now not necessary: Pred.Ensemble.Task3.s3.africa.fin$Ensemble.forecast.t3.s3.p1 <- NULL


######## clean load t2 ?
# Take the first 11 columns of the task-2 table.
Pred.Ensemble.Task2.s3.africa.series.B <-
  Pred.Ensemble.Task2.s3.africa.fin[, 1:11]


## align names
### for predict() the variable names must be identical, so series.B takes
### over the column names of the t3 table
colnames(Pred.Ensemble.Task2.s3.africa.series.B) <-
  colnames(Pred.Ensemble.Task3.s3.africa.fin)


MSE(y_pred = Pred.Ensemble.Task3.s3.africa.fin$unw.ensemble,
    y_true = Pred.Ensemble.Task3.s3.africa.fin$real)


stacking.forest.t3 <- rfsrc(
  real ~ .,
  Pred.Ensemble.Task3.s3.africa.fin[, 4:10],
  ntree = 550,
  na.action = "na.omit"
)

# Print the fitted forest summary.
stacking.forest.t3


## predict t2 set with stacking forest

stacking.forest.prediction.t2 <- predict(
  stacking.forest.t3,
  Pred.Ensemble.Task2.s3.africa.series.B[, 4:9],
  na.action = "na.omit"
)

# Print the prediction summary.
stacking.forest.prediction.t2

Pred.Ensemble.Task2.s3.africa.series.B$Stacking.prediction.t3.t2 <-
  stacking.forest.prediction.t2[["predicted"]]


### eval
MSE(y_pred = Pred.Ensemble.Task2.s3.africa.series.B$unw.ensemble,
    y_true = Pred.Ensemble.Task2.s3.africa.series.B$real)
MSE(y_pred = Pred.Ensemble.Task2.s3.africa.series.B$Stacking.prediction.t3.t2,
    y_true = Pred.Ensemble.Task2.s3.africa.series.B$real)

# Mean squared difference between the two ensemble variants.
MSE(y_pred = Pred.Ensemble.Task2.s3.africa.series.B$unw.ensemble,
    y_true = Pred.Ensemble.Task2.s3.africa.series.B$Stacking.prediction.t3.t2)



###### weighted ensemble

# Work on the first 11 columns of the task-2 table.
W.Ensemble <- Pred.Ensemble.Task2.s3.africa.fin[, 1:11]

# Per-model MSE against `real`; trailing numbers are the values noted in the
# original script.
MSE(y_pred = W.Ensemble$RF.GED.t2.s3, y_true = W.Ensemble$real)  # 0.57337
MSE(y_pred = W.Ensemble$XGB.GED.t2.s3, y_true = W.Ensemble$real) # 0.59899
MSE(y_pred = W.Ensemble$RF.United.t2.s3, y_true = W.Ensemble$real) # 0.6418669
MSE(y_pred = W.Ensemble$XGB.United.t2.s3, y_true = W.Ensemble$real) # 0.6594203
MSE(y_pred = W.Ensemble$RF.GED.t2.s3.africa, y_true = W.Ensemble$real) # 0.6561


##########
##########
##########
##########

## Task 2
##  S=2
##

#########
#########
#########
#########

# load data

Pred.Ensemble.Task2.s2.africa <- read.csv(
  file = "Pred_Ensemble_Task2_s2_africa.csv",
  header = TRUE
)
Pred.Ensemble.Task3.s2.africa <- read.csv(
  file = "Pred_Ensemble_Task3_s2_africa.csv",
  header = TRUE
)

## restructure data t2: reorder columns, append row mean of columns 4 to 9

T2.s2 <- Pred.Ensemble.Task2.s2.africa[, c(5:7, 1:4, 9:11)]
T2.s2$unw.ensemble <- rowMeans(T2.s2[, 4:9])

## check
MSE(y_pred = T2.s2$unw.ensemble, y_true = T2.s2$real)

## restructure data t3

T3.s2 <- Pred.Ensemble.Task3.s2.africa[, c(3:5, 1:2, 6:7, 9:10)]

## add real conflict values
views.ged <- read.csv(file = "views_ged.csv", header = TRUE)

## for s=2: move month_id forward by 2 before joining
views.ged$month_id <- views.ged$month_id + 2
views.ged.join <- dplyr::select(
  views.ged,
  month_id, country_id, ln_ged_best_sb_s2
)

T3.s2 <- left_join(T3.s2, views.ged.join, by = c("month_id", "country_id"))

T3.s2$unw.ensemble <- rowMeans(T3.s2[, 4:9])


## check
MSE(y_pred = T3.s2$unw.ensemble, y_true = T3.s2$ln_ged_best_sb_s2)


## make the column names of both data sets identical (T3 takes the T2 names)
colnames(T3.s2) <- colnames(T2.s2)


########
## stacking forest: fit on the T3 table (columns 4 to 10, outcome `real`)
stacking.forest <- rfsrc(
  real ~ .,
  T3.s2[, 4:10],
  ntree = 550,
  na.action = "na.omit"
)

# Print the fitted forest summary.
stacking.forest

### predict t2 ensemble values based on learning the 6 model pattern
stacking.forest.prediction <- predict(
  stacking.forest,
  T2.s2[, 4:10],
  na.action = "na.omit"
)

# Print the prediction summary.
stacking.forest.prediction

T2.s2$RF.stacking.pred <- stacking.forest.prediction[["predicted"]]
##########

##########
### stacking k-nn: fit on T3 (columns 4 to 9 vs. column 10), predict T2
stacking.knn <- caret::train(T3.s2[, 4:9], T3.s2[, 10], method = "knn")
knn.pred <- predict(object = stacking.knn, T2.s2[, 4:9])
knn.pred.frame <- as.data.frame(knn.pred)

T2.s2$knn.stacking.pred <- knn.pred.frame$knn.pred


## weighted ensemble
### add weights based on deviation from best forecasting model.
### The weighted column is built first so that the evaluation below can read
### it when the script is run from top to bottom.

# T3 s=2 forecast quality of models M1..M6:
# 0.7201  0.7007  0.7499  0.7149  0.7292  0.7243
# Each weight is the best (lowest) value divided by the model's own value.
weights.t2.s2 <- c(
  RF.GED         = 0.9730593, # M1 (RF GED glob):     0.7007 / 0.7201
  XGB.GED        = 1.00,      # M2 (XGB GED glob):    best forecast -> w = 1
  RF.United      = 0.9343913, # M3 (RF United glob):  0.7007 / 0.7499
  XGB.United     = 0.9801371, # M4 (XGB United glob): 0.7007 / 0.7149
  RF.GED.africa  = 0.9609161, # M5 (RF GED Africa):   0.7007 / 0.7292
  XGB.GED.africa = 0.9674168  # M6 (XGB GED Africa):  0.7007 / 0.7243
)

# Normalising constant (about 5.815921).
sum(weights.t2.s2)

# Weighted mean of the six model columns. Dividing by sum() of the weights
# instead of a rounded literal keeps the weights summing to exactly one.
T2.s2$w.ensemble <- ((T2.s2$RF.GED.t2.s2 * weights.t2.s2[["RF.GED"]]) +
  (T2.s2$XGB.GED.t2.s2 * weights.t2.s2[["XGB.GED"]]) +
  (T2.s2$RF.United.t2.s2 * weights.t2.s2[["RF.United"]]) +
  (T2.s2$XGB.United.t2.s2 * weights.t2.s2[["XGB.United"]]) +
  (T2.s2$RF.GED.t2.s2.africa * weights.t2.s2[["RF.GED.africa"]]) +
  (T2.s2$XGB.GED.t2.s2.africa * weights.t2.s2[["XGB.GED.africa"]])) /
  sum(weights.t2.s2)


### eval ensemble stacking RF s2
# Mean squared difference between unweighted ensemble and RF stacking.
MSE(y_pred = T2.s2$RF.stacking.pred, y_true = T2.s2$unw.ensemble)

# MSE of each ensemble variant against `real`.
MSE(y_pred = T2.s2$RF.stacking.pred, y_true = T2.s2$real)
MSE(y_pred = T2.s2$unw.ensemble, y_true = T2.s2$real)
MSE(y_pred = T2.s2$w.ensemble, y_true = T2.s2$real)
MSE(y_pred = T2.s2$knn.stacking.pred, y_true = T2.s2$real)

# Scratch comparisons using MSE values noted from an earlier run.

## weighted vs unweighted
0.5120223 - 0.5118103
0.5120223 / 0.5118103


## RF stacked vs unweighted
0.5120223 - 0.5602353
0.5120223 / 0.5602353
1.0 - 0.9139415

## k-nn stacked vs unweighted
0.5120223 - 0.6104767
0.5120223 / 0.6104767
1.0 - 0.8387254



### write and save ensemble stacking and weighting results
# CSV of the T2.s2 table plus an image of the whole workspace.
write.csv(
  T2.s2,
  file = "T2_s2_stacking_weighting.csv",
  row.names = FALSE
)
save.image(
  file = "E:/Fortis/Workspace/Views Competition/Update September 2020/T2_s2_stacking_weighting.RData"
)


##########
##########
##########
##########

## Task 2
##  S=7
##

#########
#########
#########
#########

# clear workspace


# load data

Pred.Ensemble.Task2.s7.africa <- read.csv(
  file = "Pred_Ensemble_Task2_s7_africa.csv",
  header = TRUE
)
Pred.Ensemble.Task3.s7.africa <- read.csv(
  file = "Pred_Ensemble_Task3_s7_africa.csv",
  header = TRUE
)

## restructure data t2: reorder columns, append row mean of columns 4 to 9

T2.s7 <- Pred.Ensemble.Task2.s7.africa[, c(3:5, 1:2, 6:7, 10:11, 9)]
T2.s7$unw.ensemble <- rowMeans(T2.s7[, 4:9])

## check
MSE(y_pred = T2.s7$unw.ensemble, y_true = T2.s7$ln_ged_best_sb_s7)

## restructure data t3

T3.s7 <- Pred.Ensemble.Task3.s7.africa[, c(3:5, 1:2, 6:7, 9:10)]

## add real conflict values
views.ged <- read.csv(file = "views_ged.csv", header = TRUE)

## for s=7: move month_id forward by 7 before joining
views.ged$month_id <- views.ged$month_id + 7
views.ged.join <- dplyr::select(
  views.ged,
  month_id, country_id, ln_ged_best_sb_s7
)

T3.s7 <- left_join(T3.s7, views.ged.join, by = c("month_id", "country_id"))

T3.s7$unw.ensemble <- rowMeans(T3.s7[, 4:9])


## check
MSE(y_pred = T3.s7$unw.ensemble, y_true = T3.s7$ln_ged_best_sb_s7)


## make the column names of both data sets identical (T3 takes the T2 names)
colnames(T3.s7) <- colnames(T2.s7)


########
## stacking forest: fit on the T3 table (columns 4 to 10,
## outcome ln_ged_best_sb_s7)
stacking.forest <- rfsrc(
  ln_ged_best_sb_s7 ~ .,
  T3.s7[, 4:10],
  ntree = 550,
  na.action = "na.omit"
)

# Print the fitted forest summary.
stacking.forest

### predict t2 ensemble values based on learning the 6 model pattern
stacking.forest.prediction <- predict(
  stacking.forest,
  T2.s7[, 4:10],
  na.action = "na.omit"
)

# Print the prediction summary.
stacking.forest.prediction

T2.s7$RF.stacking.pred <- stacking.forest.prediction[["predicted"]]
##########

##########
### stacking k-nn: fit on T3 (columns 4 to 9 vs. column 10), predict T2
stacking.knn <- caret::train(T3.s7[, 4:9], T3.s7[, 10], method = "knn")
knn.pred <- predict(object = stacking.knn, T2.s7[, 4:9])
knn.pred.frame <- as.data.frame(knn.pred)

T2.s7$knn.stacking.pred <- knn.pred.frame$knn.pred


## weighted ensemble
### add weights based on deviation from best forecasting model.
### The weighted column is built first so that the evaluation below can read
### it when the script is run from top to bottom.

# T3 s=7 forecast quality of models M1..M6:
# 0.8821  0.9062  0.8745  0.9038  0.9240  0.9554
# Each weight is the best (lowest) value divided by the model's own value.
weights.t2.s7 <- c(
  RF.GED         = 0.9913842, # M1 (RF GED glob):     0.8745 / 0.8821
  XGB.GED        = 0.9650188, # M2 (XGB GED glob):    0.8745 / 0.9062
  RF.United      = 1.00,      # M3 (RF United glob):  best forecast -> w = 1
  XGB.United     = 0.9675813, # M4 (XGB United glob): 0.8745 / 0.9038
  RF.GED.africa  = 0.9464286, # M5 (RF GED Africa):   0.8745 / 0.9240
  XGB.GED.africa = 0.9153234  # M6 (XGB GED Africa):  0.8745 / 0.9554
)

# Normalising constant (about 5.785736).
sum(weights.t2.s7)

# Weighted mean of the six model columns. Dividing by sum() of the weights
# instead of a rounded literal keeps the weights summing to exactly one.
T2.s7$w.ensemble <- ((T2.s7$RF.GED.t2.s7 * weights.t2.s7[["RF.GED"]]) +
  (T2.s7$XGB.GED.t2.s7 * weights.t2.s7[["XGB.GED"]]) +
  (T2.s7$RF.United.t2.s7 * weights.t2.s7[["RF.United"]]) +
  (T2.s7$XGB.United.t2.s7 * weights.t2.s7[["XGB.United"]]) +
  (T2.s7$RF.GED.t2.s7.africa * weights.t2.s7[["RF.GED.africa"]]) +
  (T2.s7$XGB.GED.t2.s7.africa * weights.t2.s7[["XGB.GED.africa"]])) /
  sum(weights.t2.s7)


### eval ensemble stacking RF s7
# Mean squared difference between unweighted ensemble and RF stacking.
MSE(y_pred = T2.s7$RF.stacking.pred, y_true = T2.s7$unw.ensemble)

# MSE of each ensemble variant against ln_ged_best_sb_s7.
MSE(y_pred = T2.s7$RF.stacking.pred, y_true = T2.s7$ln_ged_best_sb_s7)
MSE(y_pred = T2.s7$unw.ensemble, y_true = T2.s7$ln_ged_best_sb_s7)
MSE(y_pred = T2.s7$w.ensemble, y_true = T2.s7$ln_ged_best_sb_s7)
MSE(y_pred = T2.s7$knn.stacking.pred, y_true = T2.s7$ln_ged_best_sb_s7)

# Scratch comparisons using MSE values noted from an earlier run.

## weighted vs unweighted
0.6348683 - 0.6348048
0.6348683 / 0.6348048


## RF stacked vs unweighted
0.6348683 - 0.7325502
0.6348683 / 0.7325502
1.0 - 0.866655

## k-nn stacked vs unweighted
0.6348683 - 0.8096046
0.6348683 / 0.8096046
1.0 - 0.7841708


### write and save ensemble stacking and weighting results
# CSV of the T2.s7 table plus an image of the whole workspace.
write.csv(
  T2.s7,
  file = "T2_s7_stacking_weighting.csv",
  row.names = FALSE
)
save.image(
  file = "E:/Fortis/Workspace/Views Competition/Update September 2020/T2_s7_stacking_weighting.RData"
)
